* Start from a clean session: drop data, matrices, stored results, programs, etc. from memory.
* NOTE: -clear all- does not drop global macros; $Data, used below for every file path, is not
* defined in the visible part of this file and is presumably set by a calling/master do-file.
clear all
* Draw every graph in this session with the monochrome s2mono scheme.
set scheme s2mono

*********************************************
****   Description   ************************
*********************************************

* This program compares the SY2020 IV coefficients with those obtained from a random within-branch merge of census and register data, and produces "Figure 9. Replication of SY2020's IV Coefficient Estimates on Permutated Samples with Randomized Depositors".

* The datasets are pseudo datasets; as a result, the figure differs from the one in the paper.

/*-------------------------------------------------------------------------
 * Analysis setup
**------------------------------------------------------------------------- */

* --- Endogenous regressor and its excluded instruments (used in every ivregress call below)
global ACCOUNTVAR "depositor"
global INSTRUMENTS "dist_closest_branch_early early_branch_county"

* --- Outcome-specific controls: CONTROLS1 enters only the loccscore equation,
*     CONTROLS2 enters only the lrealprop equation
global CONTROLS1 "zero_inc"
global CONTROLS2 "zero_rp"

* --- Controls shared by all IV specifications
global CONTROLS1b "i.occ  i.sy_open_year_cat i.metro "
global CONTROLS2b "citypop male nchlt5 i.ncouples age_census i.related"

* --- Clustering variable passed to cluster() in the IV regressions
global CLUSTERVAR "dist_closest_branch"

* --- Defined for completeness; not referenced in the visible part of this file
global CLUSTERVAR2 "fips1870"
global USEBRANCHWITHIN 50



*********************************************
****   Data  ********************************
*********************************************

* Load the pseudo dataset and build a numeric (labelled) version of the account-opening
* year so it can enter the regressions as a factor variable (i.sy_open_year_cat).
use "$Data/BankRecordsCensus_Randomized_PseudoData_December2024.dta", clear

encode sy_open_year, generate(sy_open_year_cat)
describe sy_open_year sy_open_year_cat
label list sy_open_year_cat


*********************************************
****   SY2020 Estimates Replication  ********
*********************************************

* Result holders: one row, one column per outcome
* (1 school, 2 lit, 3 labforce, 4 loccscore, 5 lrealprop), plus the first-stage F statistic.
matrix coef_depositor = J(1, 5, 0)
matrix coef_se        = J(1, 5, 0)
matrix f_stat         = J(1, 1, 0)
foreach m in coef_depositor coef_se f_stat {
	matrix list `m'
}

* SY2020 definition of the treatment: the household-level account indicator
generate depositor = sy_account_hh

* First stage (unweighted, instruments only) for reference
regress depositor dist_closest_branch_early early_branch_county

* One IV regression per outcome; loccscore and lrealprop each get one extra control.
local depvars school lit labforce loccscore lrealprop
local j = 0
foreach y of local depvars {
	local ++j

	local extra ""
	if "`y'" == "loccscore" local extra ${CONTROLS1}
	if "`y'" == "lrealprop" local extra ${CONTROLS2}

	eststo: ivregress 2sls `y' `extra' (${ACCOUNTVAR} = ${INSTRUMENTS}) ${CONTROLS1b} ${CONTROLS2b} [pw=perwt], cluster(${CLUSTERVAR})

	* The first-stage F statistic is taken from the first (school) specification only:
	* column 4 of r(singleresults) returned by -estat firststage-.
	if `j' == 1 {
		estat firststage
		matrix fstat = r(singleresults)
		estadd scalar fs = fstat[1,4]
		matrix f_stat[1,1] = e(fs)
	}

	* Store the coefficient on depositor and its standard error for this outcome
	matrix coef_depositor[1,`j'] = _b[depositor]
	matrix list coef_depositor
	matrix coef_se[1,`j'] = _se[depositor]
	matrix list coef_se
}

* Turn the result matrices into variables (coef_depositor1-5, coef_se1-5, f_stat1),
* keep the single row that holds them, and save it for the figure.
foreach m in coef_depositor coef_se f_stat {
	svmat `m'
}

keep if _n == 1

keep coef_depositor* coef_se* f_stat

save "$Data/CoefMatrix_SY2020.dta", replace


*********************************************
****   Simulations  ************************* 
*********************************************

* Placebo simulations: in each of `nline' replications depositor status is re-drawn at random
* (individual i is a depositor with probability -frequency-), aggregated to the household
* level as in SY2020, and the five IV specifications are re-estimated. The mean coefficient
* and the standard deviation of the coefficient across replications are saved for the figure.
use "$Data/BankRecordsCensus_Randomized_PseudoData_December2024.dta", clear

encode sy_open_year, gen(sy_open_year_cat)
des sy_open_year sy_open_year_cat
label list sy_open_year_cat

* Fix the random-number seed so the placebo draws, the saved summary file and the figure
* are identical from run to run. The value itself is arbitrary.
set seed 20241201

* Number of replications and result holders (one row per replication; columns are
* 1 school, 2 lit, 3 labforce, 4 loccscore, 5 lrealprop).
local nline=500
matrix coef_depositor=J(`nline',5,0)
matrix coef_se=J(`nline',5,0)
matrix f_stat=J(`nline',1,0)

local outcomes school lit labforce loccscore lrealprop

forvalues i=1/`nline' {
	* preserve/restore puts the data back in its original state (including sort order,
	* which -bys- changes) so every replication draws on the same observation order.
	preserve

	* Randomly identify depositors
	gen depositor_i=runiform()<=frequency

	* SY2020: define as account holder any member of the household
	bys hh_id: egen depositor=max(depositor_i)
	tab depositor

	* First stage (unweighted, instruments only) for reference
	reg depositor dist_closest_branch_early early_branch_county

	local col=0
	foreach y of local outcomes {
		local ++col

		* loccscore and lrealprop each carry one additional outcome-specific control
		local extra ""
		if "`y'"=="loccscore" local extra ${CONTROLS1}
		if "`y'"=="lrealprop" local extra ${CONTROLS2}

		eststo: ivregress 2sls `y' `extra' (${ACCOUNTVAR} = ${INSTRUMENTS}) ${CONTROLS1b} ${CONTROLS2b} [pw=perwt], cluster(${CLUSTERVAR})

		* First-stage F statistic: recorded from the first (school) specification only,
		* column 4 of r(singleresults) returned by -estat firststage-.
		if `col'==1 {
			estat firststage
			mat fstat = r(singleresults)
			estadd scalar fs = fstat[1,4]
			di in red e(fs)
			matrix f_stat[`i',1]=e(fs)
		}

		matrix coef_depositor[`i',`col']=_b[depositor]
		matrix coef_se[`i',`col']=_se[depositor]
	}

	* Drop the stored estimates so they do not accumulate across replications
	estimates clear

	restore
}

* Show the complete result matrices once, instead of after every single regression
matrix list coef_depositor
matrix list coef_se
matrix list f_stat

* svmat creates coef_depositor1-5, coef_se1-5 and f_stat1
svmat coef_depositor
svmat coef_se
svmat f_stat

* Summarise across replications: mean coefficient and mean first-stage F; the "standard
* error" used in the figure is the standard deviation of the coefficient across replications
* (it replaces the per-replication IV standard errors stored in coef_se1-5).
* f_stat1 is spelled out so the command does not depend on variable abbreviation.
collapse (mean) coef_depositor* f_stat1 ///
	(sd) coef_se1=coef_depositor1 coef_se2=coef_depositor2 coef_se3=coef_depositor3 ///
	coef_se4=coef_depositor4 coef_se5=coef_depositor5

save "$Data/CoefMatrix_Random.dta", replace



*********************************************
****   Figure  ******************************
*********************************************

* Build the comparison figure from the two one-row summary files.
* After the append, observation 1 holds the random-depositor summary and observation 2
* holds the SY2020 estimates.
use "$Data/CoefMatrix_Random.dta", clear
append using "$Data/CoefMatrix_SY2020.dta"

* Print the first-stage F statistics for reference; f_stat resolves through variable
* abbreviation to f_stat1, the name svmat gives the single-column matrix. It is dropped next.
list f_stat
keep coef_depositor* coef_se* 

* Layout trick: duplicate both rows (group==0 marks originals, group==1 the copies) and
* blank out the originals so they can serve as empty spacer rows in the plot.
expand 2, generate(group)

forvalues i=1/5 {
	replace coef_depositor`i'=. if group==0
	replace coef_se`i'=. if group==0
}

* Recode group from the observation number: obs 1-4 become 3, 2, 1, 0.
* Assuming expand appends the copies in the original order, this gives
*   group 0 = SY2020 estimates, group 1 = random-depositor estimates,
*   group 2 = blank spacer (kept), group 3 = blank row (dropped below).
replace group=4-_n

* One observation per (group, outcome); var indexes the outcome:
* 1 school, 2 lit, 3 labforce, 4 loccscore, 5 lrealprop.
reshape long coef_depositor coef_se, i(group) j(var)
sort var group

drop if group==3
gen point=coef_depositor 


* 90% confidence bounds (normal critical value 1.645). For the random-depositor rows
* coef_se is the standard deviation of the coefficient across simulations.
gen upper90 = point + 1.645*coef_se
gen lower90 = point - 1.645*coef_se

* x position: each outcome occupies three consecutive rows (SY2020, random, blank spacer),
* so the outcome labels sit at 1.5, 4.5, ..., 13.5, midway between its two plotted points.
gen row=_n

twoway ///
(rcap upper90 lower90 row, vert) ///
(scatter point row if group == 0, mcolor(red)) /// 
(scatter point row if group == 1, mcolor(blue)) /// 
, xlabel(1.5 "Attended School" 4.5 "Literate" 7.5 "Works" 10.5 "Income" 13.5 "Real Property", angle(0) noticks) ///
legend(row(3) order(2 "SY2020 Depositors" 3 "Randomly Identified Depositors") pos(6)  region(lstyle(none))) /// legend at 6 o'clock position
xtitle(" ") ///
title(" ") ///
ytitle("Coefficient Estimates") 

 
